We first defined the working directory, load the required library and load the transaction dataset.
# Set Working Directory
setwd("~/SANDY/web_analytics/1_CA1/CA1_r11/3_Sequential_Rules_Analysis")
#Load Required Libraries
library(arulesViz)
library(dplyr)
library(datetime)
library(arulesSequences)
#Load Required data
dt1 = read.csv(file="train.csv")
The following is the first 5 rows of the dataset.
dt1[1:5,]
The following is the Basic Summary of the dataset.
summary(dt1)
session_id datetime_y item_id category
Min. : 11 9/14/2014 11:12:15: 6 Min. :214507331 Min. : 0.000
1st Qu.: 1319978 4/13/2014 18:02:47: 5 1st Qu.:214716973 1st Qu.: 0.000
Median : 8998744 8/31/2014 18:27:41: 5 Median :214839313 Median :14.000
Mean : 6867169 4/12/2014 16:43:56: 4 Mean :214786898 Mean : 7.634
3rd Qu.:10209851 4/13/2014 17:51:10: 4 3rd Qu.:214853154 3rd Qu.:14.000
Max. :11562121 4/13/2014 9:52:22 : 4 Max. :214990340 Max. :14.000
(Other) :346048
Recode Event Id to numeric ascending order.
df$eventID <- df$eventID[1] <- 1
for (i in 1:length(df$sequenceID)) {
if (i == 1) {df$eventID[i] <- 1} else
if( df$sequenceID[i-1] == df$sequenceID[i])
{
df$eventID[i] <-df$eventID[i-1]+1
}
}
df[1:5,]
df1 <- df
# Check dummy columns
df1$seq_test <- df1$sequenceID
df1$sequenceID <-df1$sequenceID[1] <- 1
df1[1:5,]
Recode Sequence Id to numeric ascending order.
for (i in 1:length(df1$seq_test)) {
if (i == 1) {df$sequenceID[i] <- 1} else
if(df1$seq_test[i-1] == df1$seq_test[i]){ df1$sequenceID[i] = df1$sequenceID[i-1] }
else {df1$sequenceID[i] = df1$sequenceID[i-1]+1}
}
df1[1:5,]
Export the data out as .txt files and re-construct the Transaction Basket file.
write.table(df2, "seq_format.txt", sep=" ", row.names = FALSE, col.names = FALSE, quote = FALSE)
data <- read_baskets(con = "seq_format.txt", info = c("sequenceID","eventID","SIZE"))
Show the Sequences Rules.
as(head(data), "data.frame")
Run CSpade Algorithm.
For CSAPDE algoritm you might set some lags so that you can extract rules from sequence of transactions with the lag.
We set the minimum support of rules to 0.5%.
seqs <- cspade(data, parameter = list(support = 0.0005), control = list(verbose = TRUE))
parameter specification:
support : 5e-04
maxsize : 10
maxlen : 10
algorithmic control:
bfstype : FALSE
verbose : TRUE
summary : FALSE
tidLists : FALSE
preprocessing ... 2 partition(s), 7.27 MB [0.48s]
mining transactions ... 0.02 MB [0.16s]
reading sequences ... [0.15s]
total elapsed time: 0.784s
View the Sequences.
as(seqs,"data.frame") # view the sequences
Convert extracted sequential rules to data frame and Filter rules with more than one sequence
scrul.dt <- as(seqs,"data.frame")
scrul.dt$sequence <- gsub("df3\\$cart2\\=|<|>","",scrul.dt$sequence)
scrul.dt1 <- scrul.dt[count.fields(textConnection(scrul.dt$sequence),sep = ",")>1,]
scrul.dt1
scrul.dt1[10,]
Each of unique sequences happened on the same date. For rule 10,If a customer’s first purchase is 214853102, his second purchase would be 214854840 which is frequent for around 2% of session user.
Induced the Sequences Rules.
seqrules <- ruleInduction(seqs, confidence = 0.5,control = list(verbose = TRUE))
processing ... 1271 itemsets, 876 rules [0.00s]
The following is the Sequence Rules with 50% Confidences.
as(seqrules,"data.frame") # view the rules
Testing Sequence Rules
We first defined the Working Functions and load the Test dataset
#remove duplicate items from a basket (itemstrg)
uniqueitems <- function(itemstrg) {
unique(as.list(strsplit(gsub(" ","",itemstrg),","))[[1]])
}
# execute ruleset using item as rule antecedent (handles single item antecedents only)
makepreds <- function(item, rulesDF) {
antecedent = paste("<{",item,"}> =>",sep="") # NOTE: diff from assoc analysis same fn
firingrules = rulesDF[grep(antecedent, rulesDF$rule,fixed=TRUE),1] # rules is now rule
#gsub(" ","",toString(sub(">}","",sub(".*=> <{","",firingrules))))
gsub(" ", "", toString(sub('\\}>', '', sub(".*=> <\\{", "", firingrules))))
}
# count how many predictions are in the basket of items already seen by that user
# Caution : refers to "baskets" as a global
checkpreds <- function(preds, baskID) {
plist = preds[[1]]
blist = baskets[baskets$basketID == baskID,"items"][[1]]
cnt = 0
for (p in plist) {
if (p %in% blist) cnt = cnt+1
}
cnt
}
# count all predictions made
countpreds <- function(predlist) {
len = length(predlist)
if (len > 0 && (predlist[[1]] == "")) 0 # avoid counting an empty list
else len
}
# Load the test data
testegs = read.csv(file="test.csv")
testegs = testegs[,c(1,3)]
colnames(testegs) <- c("basketID","items") # set standard names, in case they are different in the data file
# Display top 5 rows
testegs[1:5,]
#execute rules against test data
rulesDF = as(seqrules,"data.frame")
testegs$preds = apply(testegs,1,function(X) makepreds(X["items"], rulesDF))
# extract unique predictions for each test user
userpreds = as.data.frame(aggregate(preds ~ basketID, data = testegs, paste, collapse=","))
userpreds$preds = apply(userpreds,1,function(X) uniqueitems(X["preds"]))
# extract unique items for each test user
baskets = as.data.frame(aggregate(items ~ basketID, data = testegs, paste, collapse=","))
baskets$items = apply(baskets,1,function(X) uniqueitems(X["items"]))
#count how many unique predictions made are correct, i.e. have previously been bought (or rated highly) by the user
correctpreds = sum(apply(userpreds,1,function(X) checkpreds(X["preds"],X["basketID"])))
# count total number of unique predictions made
totalpreds = sum(apply(userpreds,1,function(X) countpreds(X["preds"][[1]])))
precision = correctpreds*100/totalpreds
cat("precision=", precision, "corr=",correctpreds,"total=",totalpreds)
precision= 93.81443 corr= 182 total= 194
---
title: "R NOTEBOOK - **Sequential Rules Analysis**"
output: html_notebook
---

We first defined the working directory, load the required library and load the transaction dataset.   
```{r}
# Set Working Directory
setwd("~/SANDY/web_analytics/1_CA1/CA1_r11/3_Sequential_Rules_Analysis")

#Load Required Libraries
library(arulesViz)
library(dplyr)
library(datetime)
library(arulesSequences)

#Load Required data 
dt1 = read.csv(file="train.csv")

```

#### The following is the first **5** rows of the dataset.
```{r}
dt1[1:5,]
```

#### The following is the **Basic Summary** of the dataset.
```{r}
summary(dt1)
```

#### Transform the dataset.
```{r}
df <- dt1 %>%
      arrange(datetime_y) %>%
      arrange(session_id) %>%
      unique() %>%
      group_by(session_id) %>%
      #summarise(cart=paste(item_id,collapse=";")) %>%
      ungroup()
df[1:5,]
```
Create additional columns for transformation. 
```{r}
df$sequenceID <- df$session_id
df$eventID    <- df$session_id
df$SIZE       <- '1'
df$items      <- df$item_id
df$items      <- as.factor(df$items)
df[1:5,]
```

#### Recode **Event Id** to numeric ascending order.
```{r}
df$eventID <- df$eventID[1] <- 1
for (i in 1:length(df$sequenceID)) {
  
  if (i == 1) {df$eventID[i] <- 1} else
  if( df$sequenceID[i-1] == df$sequenceID[i])
  {
      df$eventID[i] <-df$eventID[i-1]+1 
    
    }
}
```
```{r}
df[1:5,]
```

```{r}
df1 <- df
# Check dummy columns
df1$seq_test <- df1$sequenceID
df1$sequenceID <-df1$sequenceID[1] <- 1

df1[1:5,]
```

#### Recode **Sequence Id** to numeric ascending order.
```{r}
for (i in 1:length(df1$seq_test)) {
  
  if (i == 1) {df$sequenceID[i] <- 1} else 
  if(df1$seq_test[i-1] == df1$seq_test[i]){ df1$sequenceID[i] = df1$sequenceID[i-1] }
  else {df1$sequenceID[i] = df1$sequenceID[i-1]+1}
}
```
```{r}
df1[1:5,]
```

#### The **final sequence format** data is as follows:-
```{r}
df2           <- df1[c(5,6,7,8)]
df2$sequenceID <- as.integer(df2$sequenceID)
df2$eventID   <- as.integer(df2$eventID)
df2$SIZE      <- as.integer(df2$SIZE)
df2 <- df2[order(df2$sequenceID,df2$eventID),]
#seqchkpt1
df2[1:5,]
```

#### Export the data out as **.txt** files and re-construct the **Transaction Basket** file.
```{r}
write.table(df2, "seq_format.txt", sep=" ", row.names = FALSE, col.names = FALSE, quote = FALSE)
data <- read_baskets(con = "seq_format.txt", info = c("sequenceID","eventID","SIZE"))
```

#### Show **Transaction Object** Information
```{r}
 transactionInfo(data)
```

#### Show the **Sequences Rules**.
```{r}
as(head(data), "data.frame")
```

#### Run **CSpade Algorithm**.    
For CSAPDE algoritm you might set some lags so that you can extract rules from sequence of transactions with the lag.   
We set the minimum support of rules to **0.5%**.
```{r}
seqs <- cspade(data, parameter = list(support = 0.0005), control = list(verbose = TRUE))
```

#### View the **Sequences**.
```{r}
as(seqs,"data.frame")  # view the sequences
```

#### Convert extracted sequential rules to data frame and Filter rules with more than one sequence
```{r}
scrul.dt <- as(seqs,"data.frame")
scrul.dt$sequence <- gsub("df3\\$cart2\\=|<|>","",scrul.dt$sequence)

scrul.dt1 <- scrul.dt[count.fields(textConnection(scrul.dt$sequence),sep = ",")>1,]
scrul.dt1
```

```{r}
scrul.dt1[10,]
```
Each of unique sequences happened on the same date. For rule 10,If a customer’s first purchase is 214853102, his second purchase would be 214854840 which is frequent for around 2% of session user.

#### Induced the Sequences Rules.
```{r}
seqrules <- ruleInduction(seqs, confidence = 0.5,control = list(verbose = TRUE))
```

#### The following is the **Sequence Rules with 50% Confidences**.
```{r}
as(seqrules,"data.frame")  # view the rules
```

### **Testing Sequence Rules**    
We first defined the **Working Functions** and load the **Test** dataset
```{r}
#remove duplicate items from a basket (itemstrg)
uniqueitems <- function(itemstrg) {
  unique(as.list(strsplit(gsub(" ","",itemstrg),","))[[1]])
}
# execute ruleset using item as rule antecedent (handles single item antecedents only)
makepreds <- function(item, rulesDF) {
  antecedent = paste("<{",item,"}> =>",sep="") # NOTE: diff from assoc analysis same fn
  firingrules = rulesDF[grep(antecedent, rulesDF$rule,fixed=TRUE),1] # rules is now rule
  #gsub(" ","",toString(sub(">}","",sub(".*=> <{","",firingrules))))
  gsub(" ", "", toString(sub('\\}>', '', sub(".*=> <\\{", "", firingrules))))
}
# count how many predictions are in the basket of items already seen by that user 
# Caution : refers to "baskets" as a global
checkpreds <- function(preds, baskID) {
  plist = preds[[1]]
  blist = baskets[baskets$basketID == baskID,"items"][[1]]
  cnt = 0 
  for (p in plist) {
    if (p %in% blist) cnt = cnt+1
  }
  cnt
}
# count all predictions made
countpreds <- function(predlist) {
  len = length(predlist)
  if (len > 0 && (predlist[[1]] == "")) 0 # avoid counting an empty list
  else len
}
# Load the test data
testegs = read.csv(file="test.csv")
testegs = testegs[,c(1,3)]
colnames(testegs) <- c("basketID","items")  # set standard names, in case they are different in the data file
# Display top 5 rows
testegs[1:5,]
```

```{r}
#execute rules against test data
rulesDF = as(seqrules,"data.frame")
testegs$preds = apply(testegs,1,function(X) makepreds(X["items"], rulesDF))

# extract unique predictions for each test user
userpreds = as.data.frame(aggregate(preds ~ basketID, data = testegs, paste, collapse=","))
userpreds$preds = apply(userpreds,1,function(X) uniqueitems(X["preds"]))

# extract unique items for each test user
baskets = as.data.frame(aggregate(items ~ basketID, data = testegs, paste, collapse=","))
baskets$items = apply(baskets,1,function(X) uniqueitems(X["items"]))

#count how many unique predictions made are correct, i.e. have previously been bought (or rated highly) by the user
correctpreds = sum(apply(userpreds,1,function(X) checkpreds(X["preds"],X["basketID"])))

# count total number of unique predictions made
totalpreds = sum(apply(userpreds,1,function(X) countpreds(X["preds"][[1]]))) 
precision = correctpreds*100/totalpreds
cat("precision=", precision, "corr=",correctpreds,"total=",totalpreds)
```
